{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3f9e34ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "!BUILD_CUDA_EXT=0 pip install -q auto-gptq transformers\n",
    "\n",
    "import random\n",
    "\n",
    "from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig\n",
    "from datasets import load_dataset\n",
    "import torch\n",
    "from transformers import AutoTokenizer\n",
    "\n",
    "\n",
    "# Define base model and output directory\n",
    "model_id = \"./gpt2\"\n",
    "out_dir = model_id + \"-GPTQ\"\n",
    "\n",
    "# Load quantize config, model and tokenizer\n",
    "quantize_config = BaseQuantizeConfig(\n",
    "    bits=4,\n",
    "    group_size=128,\n",
    "    damp_percent=0.01,\n",
    "    desc_act=False,\n",
    ")\n",
    "model = AutoGPTQForCausalLM.from_pretrained(model_id, quantize_config)\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
    "\n",
    "\n",
    "# Load data and tokenize examples\n",
    "n_samples = 1024\n",
    "data = load_dataset(\"./c4\", data_files=\"en/c4-train.00001-of-01024.json.gz\", split=f\"train[:{n_samples*5}]\")\n",
    "tokenized_data = tokenizer(\"\\n\\n\".join(data['text']), return_tensors='pt')\n",
    "\n",
    "# Format tokenized examples\n",
    "examples_ids = []\n",
    "for _ in range(n_samples):\n",
    "    i = random.randint(0, tokenized_data.input_ids.shape[1] - tokenizer.model_max_length - 1)\n",
    "    j = i + tokenizer.model_max_length\n",
    "    input_ids = tokenized_data.input_ids[:, i:j]\n",
    "    attention_mask = torch.ones_like(input_ids)\n",
    "    examples_ids.append({'input_ids': input_ids, 'attention_mask': attention_mask})\n",
    "    \n",
    "    \n",
    "# %%time\n",
    "\n",
    "# Quantize with GPTQ\n",
    "model.quantize(\n",
    "    examples_ids,\n",
    "    batch_size=1,\n",
    "    use_triton=True,\n",
    ")\n",
    "\n",
    "# Save model and tokenizer\n",
    "model.save_quantized(out_dir, use_safetensors=True)\n",
    "tokenizer.save_pretrained(out_dir)\n",
    "\n",
    "print(\"量化结束，且保存完成，开始尝试测试量化模型！！！\")\n",
    "\n",
    "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
    "\n",
    "# Reload model and tokenizer\n",
    "model = AutoGPTQForCausalLM.from_quantized(\n",
    "    out_dir,\n",
    "    device=device,\n",
    "    use_triton=True,\n",
    "    use_safetensors=True,\n",
    ")\n",
    "tokenizer = AutoTokenizer.from_pretrained(out_dir)\n",
    "\n",
    "# 这里似乎不能用pipline，只能用.generate\n",
    "# from transformers import pipeline\n",
    "\n",
    "# generator = pipeline('text-generation', model=model, tokenizer=tokenizer)\n",
    "# result = generator(\"I have a dream\", do_sample=True, max_length=50)[0]['generated_text']\n",
    "# print(result)\n",
    "\n",
    "# 正确的生成方式\n",
    "inputs = tokenizer(\"I have a dream\", return_tensors=\"pt\").to(\"cuda:0\")\n",
    "outputs = model.generate(\n",
    "    input_ids=inputs.input_ids,  # 必须使用命名参数\n",
    "    attention_mask=inputs.attention_mask,\n",
    "    max_length=50,\n",
    "    do_sample=True\n",
    ")\n",
    "print(tokenizer.decode(outputs[0], skip_special_tokens=True))\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.11.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
